https://drive.google.com/open?id=1OZNJm81JXucV3HmZroMq6qCT2m7ez7IJ
# Mount Google Drive so the dataset and output files are accessible in Colab.
from google.colab import drive
drive.mount('/content/drive')
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Silence library deprecation chatter in the notebook output.
warnings.filterwarnings("ignore")
plt.rcParams.update({'font.size': 14})
# Fixed seed used by resampling and word clouds for reproducibility.
RANDOM_STATE = 41
# 1 = load precomputed intermediate results from Excel instead of recomputing
# the slow translation/cleaning steps; 0 = recompute everything.
LOAD_FROM_FILE = 1
datasetPath = '/content/drive/My Drive/GL/Dataset/'
filePath = '/content/drive/My Drive/GL/Capstone/'
df = pd.read_excel(datasetPath + 'input_data.xlsx')
# Keep an untouched copy of the raw data.
data = df.copy()
print("Original Data Shape:", df.shape)
df.head(25)
# Column-name constants used throughout the notebook.
AssignmentGroup = "Assignment group"
Description = "Description"
ShortDescription = "Short description"
df.info()
The dataset has 8500 rows with 4 columns, and all are of object data type. Looking at the data, we can infer the following points:
# Dropping Caller Column — caller identity carries no signal for routing tickets.
df = df.drop (columns='Caller', axis = 1)
df.head(25)
def SetHorizontalBarValues(ax):
    """Annotate each horizontal bar in *ax* with its percentage of the total.

    ax -- a matplotlib Axes produced by a barh plot.
    """
    # Collect every bar's width (its value) to compute the grand total.
    widths = [patch.get_width() for patch in ax.patches]
    total = sum(widths)
    # Write the percentage label just past the right edge of each bar.
    for patch in ax.patches:
        ax.text(patch.get_width() + 10, patch.get_y() + .46,
                str(round((patch.get_width() / total) * 100, 2)) + '%')
    # Invert the y-axis so the largest bar appears on top.
    ax.invert_yaxis()
def SetVerticalBarValues(ax, hgt_spacing):
    """Annotate each vertical bar in *ax* with its height, rotated 45 degrees.

    ax          -- a matplotlib Axes produced by a bar plot.
    hgt_spacing -- vertical gap between the top of a bar and its label.
    """
    for patch in ax.patches:
        # get_x positions the label horizontally; get_height + spacing puts it
        # just above the bar.
        ax.text(patch.get_x() - .03, patch.get_height() + hgt_spacing,
                str(round(patch.get_height(), 2)), rotation=45)
def MultiClassDataDistribution(col):
    """Plot the class frequency and percentage distributions of *col*.

    col -- a pandas Series of categorical labels.
    Produces a two-panel figure: absolute counts on top, percentages below.
    """
    print('Total no. of unique values are ' + str(col.nunique()))
    freq = dict(col.value_counts())
    data_df = pd.DataFrame.from_dict(freq, orient='index', columns=['Frequency'])
    # Top panel: absolute class frequencies.
    plt.subplot(2, 1, 1)
    fr = data_df['Frequency'].plot(kind='bar', figsize=(18, 7), rot=60)
    SetVerticalBarValues(fr, 5)
    plt.title('Class Frequency Distribution', y=1.1)
    # Bottom panel: the same distribution expressed as percentages.
    plt.subplot(2, 1, 2)
    data_df['Percentage'] = (data_df['Frequency'] / data_df['Frequency'].sum()) * 100
    pr = data_df['Percentage'].plot(kind='bar', figsize=(18, 7), rot=60)
    SetVerticalBarValues(pr, 0.05)
    plt.title('Class Percentage Distribution', y=1.1)
    plt.tight_layout()
    plt.show()
MultiClassDataDistribution(df[AssignmentGroup])
Inference
# Merge the groups with small entries into the MISC group
grpList = df.groupby([AssignmentGroup])
regroup=[]
# Collect every assignment group with fewer than 25 tickets.
for grp in df[AssignmentGroup].unique():
if(grpList.get_group(grp).shape[0] < 25):
regroup.append(grp)
print('Found {} groups which have under 25 samples'.format(len(regroup)))
# Relabel all small groups as 'MISC'.
df[AssignmentGroup] = df[AssignmentGroup].apply(lambda x : 'MISC' if x in regroup else x)
df[AssignmentGroup].unique()
MultiClassDataDistribution(df[AssignmentGroup])
# Inspect fully-duplicated rows before dropping them.
duplicate_df = df[df.duplicated()]
duplicate_df.head(25)
duplicate_df[AssignmentGroup].value_counts()
df = df.drop_duplicates([ShortDescription, Description, AssignmentGroup])
print("Data Shape after dropping Duplicate Values:", df.shape)
df.isnull().sum()
df[df.isnull().any(axis=1)]
# Replace missing text with a blank instead of dropping rows, to keep samples.
df[ShortDescription].replace(np.NaN, ' ', inplace=True)
df[Description].replace(np.NaN, ' ', inplace=True)
print("Data Shape after dropping Missing Values:", df.shape)
df.isnull().sum()
import nltk
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from nltk.probability import FreqDist
def ShowWordCloud(data):
    """Render a word cloud of token frequencies in *data*, then plot the top words.

    data -- an iterable of text values (e.g. a pandas Series); each value is
    stringified and whitespace-tokenized, tokens lower-cased.
    """
    words = []
    for val in data:
        # Stringify, split on whitespace and lower-case every token.
        words += [str(tok).lower() for tok in str(val).split()]
    freq_words = nltk.FreqDist(words)
    wordcloud = WordCloud(width=1024, height=786,
                          background_color='black',
                          random_state=RANDOM_STATE,
                          colormap="Blues",
                          collocations=False)
    wordcloud.generate_from_frequencies(dict(freq_words))
    plt.figure(figsize=(20, 10))
    plt.axis("off")
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.show()
    # Companion bar chart of the most frequent words.
    TopNWordsInText(data)
def TopNWordsInText(data, n=20):
    """Plot a horizontal bar chart of the *n* most frequent unigrams in *data*.

    data -- an iterable of documents accepted by CountVectorizer.
    n    -- number of top words to show (default 20).
    """
    vectorizer = CountVectorizer(ngram_range=(1, 1), analyzer='word')
    matrix = vectorizer.fit_transform(data)
    # Summing the document-term rows gives each word's corpus-wide frequency.
    freq = sum(matrix).toarray()[0]
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # get_feature_names_out() replacement.
    freq_df = pd.DataFrame(freq, index=vectorizer.get_feature_names_out(),
                           columns=['Frequency'])
    top_n_df = freq_df.sort_values(by="Frequency", ascending=False).head(n)
    ax = top_n_df.plot(figsize=(12, 8), kind="barh",
                       title="Top " + str(n) + " Frequent Words")
    SetHorizontalBarValues(ax)
ShowWordCloud(df[ShortDescription])
The graph indicates that the top words in Short Description are stop words i.e to, in, at, on, is etc. The other prominent words are job, job_scheduler, failed, erp etc
ShowWordCloud(df[Description])
Inference:
Looking at the data we could find that both short description as well as description gives some information about the ticket. In some cases both are same & in some cases short description has proper text & description doesn't have any meaningful text. Hence, we can combine Short Description & Description as one column.
df["Full Description"] = df[ShortDescription].map(str) + ' ' + df[Description].map(str)
df.head(25)
Due to our last activity of combining columns, we can see that there are duplicate words in the new Full Description column. Hence we will delete the "duplicates words" in Full Description column
OriginalDescription = "Original Description"
# Deleting the duplicates words
df["Full Description"] = df["Full Description"].apply(lambda x:' '.join(pd.unique(x.split())))
# Copying the data frame with Merged Description
merged_df = df.copy()
df.to_excel("Combined-Text-Features-Data.xlsx", index=False)
# Pick required column & Rename to Description for further processing
df = df[['Full Description', AssignmentGroup]]
df.rename(columns={"Full Description": OriginalDescription}, inplace=True)
merged_df.head(25)
ShowWordCloud(df[OriginalDescription])
Inference
Pre-processing is mandatory to get the data in a consistent format. We will create a function which will perform the following tasks on the text columns:
Combined Description column has many unicode characters. We can fix this using the python library ftfy.
!pip install ftfy
import ftfy
df["Decoded Description"] = df[OriginalDescription].apply(lambda x: ftfy.fix_text(x))
# Copying the data frame with Decoded Description
fixed_df = df.copy()
df.to_excel("Unicode-Fixed-Data.xlsx", index=False)
# Pick required column & Rename to Description for further processing
df = df[['Decoded Description', AssignmentGroup, OriginalDescription]]
df.rename(columns={"Decoded Description": Description}, inplace=True)
fixed_df.head(25)
Dataset is cleaned from all the Unicode characters and the same is saved in an excel file "Unicode-Fixed-Data.xlsx" for future reference.
The Combined Description has text from different languages. We can fix this using the Google Translate API in python.
!pip install google_trans_new
from google_trans_new import google_translator
translator = google_translator()
def DetectLanguage(row):
    """Best-effort language detection of *row*; falls back to English.

    NOTE(review): `detect` is never imported anywhere in this file
    (presumably `from langdetect import detect` was intended) — confirm the
    import, otherwise this function always returns "en" via the except path.
    """
    try:
        language = detect(row)
    except Exception:
        # Detection failed (or `detect` is undefined): assume English.
        language = "en"
    return language
def TranslateLanguage(row):
    """Translate *row* to English with google_trans_new.

    On any translation failure the offending row is printed for inspection
    and returned unchanged (best-effort behavior).
    """
    try:
        result = translator.translate(row, lang_tgt='en')
    except Exception:
        print(row)
        result = row
    return result
# INTRODUCED LOAD_FROM_FILE FLAG SINCE IT IS TIME INTENSIVE AND TAKES AROUND 10 MINUTES TO TRANSLATE EACH WORD IN THE DESCRIPTION COLUMN
# HAVE STORED THE DATA IN EXCEL. WILL LOAD THAT IN ORDER TO SAVE COMPUTATION TIME
if LOAD_FROM_FILE == 0 :
df["Translated Description"] = df[Description].apply(lambda x: TranslateLanguage(x))
else:
df = pd.read_excel(datasetPath + 'Translated-Data.xlsx')
df = df.dropna(how='any',axis=0)
# Copying the data frame with Translated Description
translated_df = df.copy()
df.to_excel("Translated-Data.xlsx", index=False)
%cp /content/Translated-Data.xlsx /content/drive/My\ Drive/GL/Capstone
# Pick required column & Rename to Description for further processing
df = df[['Translated Description', AssignmentGroup, OriginalDescription]]
df.rename(columns={"Translated Description": Description}, inplace=True)
print("Data Shape:", translated_df.shape)
translated_df.head(25)
Dataset is translated from other languages to English and the same is saved in an excel file "Translated-Data.xlsx" for future reference. Still, there are many non-English words which do not make any sense, hence we will clean them before processing.
Even after Language Translation we can see words that do not make any sense. Thus, installing the nostril library to detect such jitted words and remove whatever is possible from the dataset. Thus ensuring less overall Word Vocabulary size
!sudo pip3 install git+https://github.com/casics/nostril.git
!pip install clean-text[gpl]
import re
from nostril import nonsense
from cleantext import clean
def CleanSentence(sentence):
    """Normalize a raw ticket description into plain lowercase ASCII words.

    Pipeline: clean-text normalization (unicode/case/URLs/emails/numbers/
    punctuation), drop over-long tokens, strip e-mail header artifacts via
    regexes tailored to this dataset, drop nonsense tokens, drop single letters.
    """
    m = clean(sentence,
              fix_unicode=True,          # fix various unicode errors
              to_ascii=True,             # transliterate to closest ASCII representation
              lower=True,                # lowercase text
              no_line_breaks=True,       # fully strip line breaks
              no_urls=True,              # replace all URLs with a special token
              no_emails=True,            # replace all email addresses with a special token
              no_phone_numbers=True,     # replace all phone numbers with a special token
              no_numbers=True,           # replace all numbers with a special token
              no_digits=True,            # replace all digits with a special token
              no_currency_symbols=True,  # replace all currency symbols with a special token
              no_punct=True,             # remove punctuation
              replace_with_punct=" ",    # replace punctuation with a space
              replace_with_url="",
              replace_with_email="",
              replace_with_phone_number="",
              replace_with_number="",
              replace_with_digit="",
              replace_with_currency_symbol="",
              lang="en")                 # set to 'de' for German special handling
    # Drop very long tokens (ids, hashes, encoded blobs).
    m = ' '.join([w for w in m.split() if len(w) <= 15])
    # Strip leftover e-mail header lines and artifacts.
    m = re.sub('from:(.*)\\r\\n', '', m)                          # from line
    m = re.sub('sent:(.*)\\r\\n', '', m)                          # sent to line
    m = re.sub('received from:(.*)\\r\\n', '', m)                 # received data line
    m = re.sub('received', '', m)                                 # received data line
    m = re.sub('to:(.*)\\r\\n', '', m)                            # to line
    m = re.sub('cc:(.*)\\r\\n', '', m)                            # cc line
    m = re.sub('(.*)infection', '', m)                            # footer
    m = re.sub('\\[cid:(.*)]', '', m)                             # inline image cids
    m = re.sub('https?:[^\\]\\n\\r]+', '', m)                     # https & http
    m = re.sub('subject: ', '', m)                                # Subject word
    m = re.sub('[\\w\\d\\-\\_\\.]+@[\\w\\d\\-\\_\\.]+', '', m)    # emails are not required
    m = re.sub('[0-9][\\-0–90-9 ]+', '', m)                       # phones are not required
    m = re.sub('[0-9]', '', m)                                    # numbers not needed
    # BUG FIX: the original pattern '[^a-zA-z 0-9]+' used the range 'A-z',
    # which also matches [ \ ] ^ _ and backtick, so those characters survived.
    m = re.sub('[^a-zA-Z 0-9]+', '', m)                           # anything that is not a letter
    m = re.sub('[\\r\\n]', '', m)                                 # stray CR/LF
    m = RemoveNonsenseWords(m)
    m = re.sub(' [a-zA-Z] ', ' ', m)                              # single letters make no sense
    return ' '.join(m.split())
def RemoveNonsenseWords(sentence):
    """Drop tokens that nostril's nonsense() classifies as gibberish.

    Tokens that nonsense() cannot score (too short, non-alphabetic, or the
    library raising for any reason) are kept — the broad except is a
    deliberate best-effort policy, not error suppression.
    """
    words = []
    for w in sentence.split():
        try:
            if not nonsense(w):
                words.append(w)
        except Exception:
            words.append(w)
    return ' '.join(words)
# INTRODUCED LOAD_FROM_FILE FLAG SINCE IT IS TIME INTENSIVE AND TAKES AROUND 40 MINUTES TO CLEAN EACH WORD IN THE DESCRIPTION COLUMN
# HAVE STORED THE DATA IN EXCEL. WILL LOAD THAT IN ORDER TO SAVE COMPUTATION TIME
if LOAD_FROM_FILE == 0 :
df["Clean Description"] = df[Description].apply(lambda x: CleanSentence(x))
else:
df = pd.read_excel(datasetPath + 'Cleaned-Data.xlsx')
df = df.dropna(how='any',axis=0)
# Copying the data frame with Cleaned Description
cleaned_df = df.copy()
df.to_excel('Cleaned-Data.xlsx', index=False)
%cp /content/Cleaned-Data.xlsx /content/drive/My\ Drive/GL/Capstone
# Pick required column & Rename to Description for further processing
df = df[['Clean Description', AssignmentGroup, OriginalDescription]]
df.rename(columns={"Clean Description": Description}, inplace=True)
print("Data Shape:", cleaned_df.shape)
cleaned_df.head(25)
def ReplaceWords(m):
    """Normalize common misspellings/variants observed in the ticket corpus.

    Note: str.replace works on substrings, so e.g. 'sid' -> 'id' also rewrites
    occurrences inside longer words; the replacement order matters and is kept
    exactly as in the original notebook.
    """
    replacements = (
        ('aerp', 'asap'),
        ('amerirtca', 'america'),
        ('rtr', 'router'),
        ('sid', 'id'),
        ('wi fi', 'wifi'),
        ('log in', 'login'),
        ('can not', 'cannot'),
        ('add in', 'addin'),
    )
    for old, new in replacements:
        m = m.replace(old, new)
    # Stand-alone single letters carry no meaning.
    m = re.sub(' [a-zA-Z] ', ' ', m)
    return ' '.join(m.split())
df["Clean Description"] = df[Description].apply(lambda x: ReplaceWords(x))
df = df.dropna(how='any',axis=0)
# Copying the data frame with Cleaned Description
cleaned_df = df.copy()
df.to_excel('Final-Cleaned-Data.xlsx', index=False)
# Pick required column & Rename to Description for further processing
df = df[['Clean Description', AssignmentGroup, OriginalDescription]]
df.rename(columns={"Clean Description": Description}, inplace=True)
cleaned_df.head(25)
ShowWordCloud(df[Description])
!pip install spacy
import spacy
# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
nlp = spacy.load('en', disable=['parser', 'ner'])
def LemmatizeSentence(sentence):
    """Lemmatize *sentence* with spaCy, keeping pronouns as their surface form.

    spaCy v2 returns the sentinel lemma "-PRON-" for pronouns; in that case the
    original token text is kept instead of the sentinel.
    """
    doc = nlp(sentence)
    return " ".join([token.text if token.lemma_ == "-PRON-" else token.lemma_
                     for token in doc])
# INTRODUCED LOAD_FROM_FILE FLAG SINCE IT TAKES TIME TO LEMMATIZE EACH WORD IN THE DESCRIPTION COLUMN
# HAVE STORED THE DATA IN EXCEL. WILL LOAD THAT IN ORDER TO SAVE COMPUTATION TIME
if LOAD_FROM_FILE == 0 :
df["Lemmatized Description"] = df[Description].apply(lambda x: LemmatizeSentence(x))
else:
df = pd.read_excel(datasetPath + 'Lemmatized-Data.xlsx')
df = df.dropna(how='any',axis=0)
# Copying the data frame with Lemmatized Description
lemmatized_df = df.copy()
df.to_excel('Lemmatized-Data.xlsx', index=False)
%cp /content/Lemmatized-Data.xlsx /content/drive/My\ Drive/GL/Capstone
# Pick required column & Rename to Description for further processing
df = df[['Lemmatized Description', AssignmentGroup, OriginalDescription]]
df.rename(columns={"Lemmatized Description": Description}, inplace=True)
print("Data Shape:", lemmatized_df.shape)
lemmatized_df.head(25)
ShowWordCloud(df[Description])
Word Cloud has not changed much after Lemmatization. The dataset continues to have stop words occurring frequently.
all_stopwords = nlp.Defaults.stop_words
# Keep a few domain-relevant words that spaCy would otherwise drop.
for keep in ('not', 'call', 'out'):
    if keep in all_stopwords:
        all_stopwords.remove(keep)
# Domain-specific noise tokens to treat as stop words.
# BUG FIX: the original literal was missing a comma between "zz" and "fi",
# so Python concatenated them into the single string "zzfi" and neither
# "zz" nor "fi" was actually filtered.
all_stopwords |= {"hi", "pm", "cc", "usa", "ve", "abc", "xyz", "st", "th", "al",
                  "et", "mx", "cr", "conn", "jco", "tm", "na", "g", "sv", "ef",
                  "inc", "wy", "cee", "tel", "tys", "ae", "aw", "pi", "ic", "pl",
                  "pls", "sr", "zz", "fi", "fw", "ref", "rak", "ltm", "m", "mm",
                  "d", "b", "a", "r", "c", "ar", "eh", "pu", "xd", "oh", "oooh",
                  "o", "s"}
def RemoveStopWords(sentence):
    """Remove every token contained in the module-level all_stopwords set."""
    doc = nlp(sentence)
    return " ".join([word.text for word in doc if word.text not in all_stopwords])
df["Processed Description"] = df[Description].apply(lambda x: RemoveStopWords(x))
# Copying the data frame with Lemmatized Description
processed_df = df.copy()
df.to_excel('Stop-Word-Removed-Data.xlsx', index=False)
# Pick required column & Rename to Description for further processing
df = df[['Processed Description', AssignmentGroup, OriginalDescription]]
df.rename(columns={"Processed Description": Description}, inplace=True)
processed_df.head(25)
ShowWordCloud(df[Description])
import nltk
nltk.download('words')
nltk.download('brown')
nltk.download('webtext')
from nltk.corpus import words
from nltk.corpus import brown
from nltk.corpus import webtext
all_words = set(nltk.corpus.words.words() + brown.words() + webtext.words())
exclusion_list = ['erp', 'scheduler', 'telecom', 'vpn', 'crm', 'skype', 'gsc', 'verizon', 'inwarehouse', 'inplant', 'screenshot', 'apac', 'iphone', 'teamviewer', 'wifi', 'hrp', 'plm', 'netweaver', \
'dc', 'dsw', 'workflow', 'sr', 'bex', 'gb', 'gso', 'hp', 'ewew', 'dmvpn', 'mpls', 'cvss', 'etime', 'vmax', 'vlan', 'sms', 'rpc', 'inbox', 'activesync', 'citrix', 'router', 'hr', \
'hostname', 'inplant', 'payslip', 'ip', 'ess', 'folder', 'pc', 'dlv', 'mm', 'sw', 'pwd', 'ave', 'cert', 'pdf', 'stack', 'cadagent', 'payroll', 'ie', 'tab', 'acrobat', 'sql', 'prod', \
'java', 'cpu', 'mms', 'laptop', 'datum', 'bkwin', 'app', 'kiosk', 'reset', 'eu', 'hub', 'website', 'qa', 'mfg', 'admin', 'reroute', 'browser', 'esrs', 'fwd', 'msg', 'dialog', 'asap', \
'lan', 'dept', 'cnc', 'vip', 'prs', 'faq', 'sso', 'hrm', 'pricing', 'wireless', 'meter', 'hcm', 'misplace', 'setup', 'quarantine', 'spamme', 'robot', 'unblock', 'scm', 'api', 'lotus', \
'chg', 'zdis', 'hrtool', 'ticketno', 'hostname', 'addin', 'hpqc', 'infopath', 'ipad', 'mii']
Spelling Corrector Functions
import re
from collections import Counter
def words(text):
    """Tokenize *text* into lowercase word tokens (alphanumerics/underscore)."""
    return re.findall(r'\w+', text.lower())
WORDS = Counter(words(open(datasetPath + 'big.txt').read()))
def P(word, N=sum(WORDS.values())):
    """Probability of `word` under the unigram model in WORDS.

    N defaults to the total corpus token count, evaluated once at definition
    time; unknown words get probability 0 (Counter returns 0 for misses).
    """
    return WORDS[word] / N
def correction(word):
    """Most probable spelling correction for `word` (Norvig-style corrector)."""
    return max(candidates(word), key=P)
def candidates(word):
    """Generate possible spelling corrections for `word`.

    Preference order: the word itself if known, then known 1-edit variants,
    then known 2-edit variants, finally the word unchanged.
    """
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
def known(words):
    """The subset of `words` that appear in the WORDS dictionary."""
    return set(w for w in words if w in WORDS)
def edits1(word):
    """All edits that are one edit away from `word`.

    Produces every deletion, adjacent transposition, single-letter replacement
    and single-letter insertion, as a set.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)
def edits2(word):
    """All edits that are two edits away from `word` (lazy generator)."""
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
from google_trans_new import google_translator
translator = google_translator()
count = 0
def TranslateWords(row):
    """Translate unknown tokens of *row* to English, then spell-correct them.

    Tokens already in the English corpora (all_words), in exclusion_list, or
    non-alphabetic are kept untouched. Progress is reported every 500 rows via
    the module-level `count`.

    NOTE(review): the notebook export lost indentation; this reconstruction
    applies correction() only to translated tokens and returns after the loop —
    confirm against the saved Translated-Words-Master.xlsx output.
    """
    global count
    count += 1
    if count % 500 == 0:
        print(count, "rows translated.")
    words = []
    for w in nltk.wordpunct_tokenize(row):
        w = w.lower()
        if w in all_words or w in exclusion_list or not w.isalpha():
            wrd = w
        else:
            try:
                wrd = translator.translate(w, lang_tgt='en')
            except Exception:
                # Translation service failed; keep the original token.
                wrd = w
            # Spell-correct the (possibly translated) token.
            wrd = correction(wrd)
        words.append(wrd)
    return ' '.join([str(elem) for elem in words])
# INTRODUCED LOAD_FROM_FILE FLAG SINCE IT IS TIME INTENSIVE AND TAKES AROUND 20-25 MINUTES TO TRANSLATE EACH WORD IN THE DESCRIPTION COLUMN
# HAVE STORED THE DATA IN EXCEL. WILL LOAD THAT IN ORDER TO SAVE COMPUTATION TIME
if LOAD_FROM_FILE == 0 :
df["Translated Description"] = df[Description].apply(lambda x: TranslateWords(x))
else:
df = pd.read_excel(datasetPath + 'Translated-Words-Master.xlsx')
df = df.dropna(how='any',axis=0)
# Copying the data frame with Translated Description
translated_w_df = df.copy()
df.to_excel("Translated-Words-Master.xlsx", index=False)
%cp /content/Translated-Words-Master.xlsx /content/drive/My\ Drive/GL/Capstone
# Pick required column & Rename to Description for further processing
df = df[['Translated Description', AssignmentGroup, OriginalDescription]]
df.rename(columns={"Translated Description": Description}, inplace=True)
print("Data Shape:", translated_w_df.shape)
translated_w_df.head(25)
ShowWordCloud(df[Description])
wrd = []
def GetNonEnglishWords(sent):
    """Collect unknown tokens of *sent* into the module-level `wrd` list.

    Side-effect only (returns None): lowercase alphabetic tokens that are not
    in the English corpora (all_words) are appended to `wrd` for later
    frequency analysis.
    """
    for w in nltk.wordpunct_tokenize(sent):
        w = w.lower()
        if w in all_words or not w.isalpha():
            continue
        wrd.append(w)
df[Description].apply(lambda x: GetNonEnglishWords(x))
import collections
c = collections.Counter(wrd)
tmp = pd.DataFrame.from_dict(c, orient='index').reset_index()
tmp = tmp.rename(columns={'index':'word', 0:'count'})
tmp.to_excel("Words.xlsx")
from collections import OrderedDict
def RemoveNonEnglishWords(sent):
    """Keep only recognizable tokens of *sent*, de-duplicated in order.

    A token survives if it is longer than one character AND is either a known
    English word (all_words), whitelisted (exclusion_list), or non-alphabetic.
    OrderedDict.fromkeys de-duplicates while preserving first-seen order.
    """
    wrds = []
    for w in nltk.wordpunct_tokenize(sent):
        w = w.lower()
        if len(w) > 1 and (w in all_words or w in exclusion_list or not w.isalpha()):
            wrds.append(w)
    return ' '.join(OrderedDict().fromkeys(wrds))
df["Clean Description"] = df[Description].apply(lambda x: RemoveNonEnglishWords(x))
# Copying the data frame with English Description
english_df = df.copy()
df.to_excel('English-Data.xlsx', index=False)
%cp /content/English-Data.xlsx /content/drive/My\ Drive/GL/Capstone
# Pick required column & Rename to Description for further processing
df = df[['Clean Description', AssignmentGroup]]
df.rename(columns={"Clean Description": Description}, inplace=True)
english_df.head(25)
df = english_df
df = df[['Clean Description', AssignmentGroup]]
df.rename(columns={"Clean Description": Description}, inplace=True)
temp_df = df[df[AssignmentGroup] == 'GRP_0'].reset_index(drop=True)
print("Shape of Group 0 dataset:", temp_df.shape)
def UpdateDescription(frame, idxs, value):
    """Set the Description column to *value* at each index in *idxs*, in place.

    BUG FIX: the original used chained assignment (frame[Description][idx] =
    value), which pandas may apply to a temporary copy instead of *frame*
    (SettingWithCopyWarning); .loc assigns on the frame itself.
    """
    for idx in idxs:
        frame.loc[idx, Description] = value
    return frame
def ReplaceRedundantData(frame, filterlist, matchlist, replacetext):
    """Collapse near-duplicate descriptions to one canonical *replacetext*.

    filterlist -- substring regex patterns; rows whose Description contains any
                  of them are rewritten.
    matchlist  -- full-match regex patterns; rows whose Description fully
                  matches any of them are rewritten.
    Returns the (in-place updated) frame.
    """
    res = frame
    if filterlist:
        filterpattern = '|'.join(filterlist)
        idxs = frame[frame[Description].str.contains(filterpattern)].index
        res = UpdateDescription(frame, idxs, replacetext)
    if matchlist:
        matchpattern = '|'.join(matchlist)
        idxs = frame[frame[Description].str.fullmatch(matchpattern)].index
        res = UpdateDescription(frame, idxs, replacetext)
    return res
temp_df = ReplaceRedundantData(temp_df,
['ad account lock', 'ad password reset', 'ad lock password', 'account lock ad', 'unlock ad account'],
['ad lock', 'password reset ad', 'active directory lock'],
'ad active directory account lock need unlock password reset')
temp_df = ReplaceRedundantData(temp_df,
['account lock erp', 'account unlock erp', 'erp account lock', 'erp account lockout', 'erp account unlock', 'erp lock' ],
[], 'erp account lock need unlock windows')
temp_df = ReplaceRedundantData(temp_df,
['erp password block', 'erp password forget', 'erp password lock', 'erp password reset' ], \
[], 'erp password reset lock request need unlock windows user')
temp_df = ReplaceRedundantData(temp_df,
['account disable', 'account unlock request', 'account lock issue', 'account lock hello team', 'account lock frequently', \
'frequent account lockout', 'frequent account lock user', 'account lock release'], \
['account lockout', 'account unlock', 'account lock', 'account unlock not', 'account lock unlock', 'account lock window', 'frequent account lock'],
'account lock need unlock windows computer')
temp_df = ReplaceRedundantData(temp_df, ['account lock password', 'account password summary'], [], 'account lock password reset expire')
temp_df = ReplaceRedundantData(temp_df, ['blank call'], [], 'blank gso call loud noise disconnect')
temp_df = ReplaceRedundantData(temp_df, ['call check account'], [], 'call check account disable active')
temp_df = ReplaceRedundantData(temp_df, ['ie browser issue'], ['browser issue'], 'ie browser issue website not load')
temp_df = ReplaceRedundantData(temp_df, ['audio not work', 'audio issue', 'sound issue'], ['sound not work'], \
'audio laptop not work pc tablet sound driver issue skype service call contact teamviewer connect setting')
temp_df = ReplaceRedundantData(temp_df, ['attendance tool password', 'attendance tool login'], [], 'attendance tool password forget reset need help')
temp_df = ReplaceRedundantData(temp_df, ['add member', 'add share'], [], 'add member share distribution mailbox group outlook')
temp_df = ReplaceRedundantData(temp_df, ['reset password erp production'], [], 'reset password erp production unlock unable lock hcm login account user')
temp_df = ReplaceRedundantData(temp_df, [],
['collaboration platform not', 'collaboration platform not open', 'collaboration platform not available internet access', \
'browser issue collaboration platform not load', 'collaboration platform issue', 'collaboration platform query', \
'collaboration platform site not open'],
'collaboration platform not available open load browser internet issue query')
temp_df = ReplaceRedundantData(temp_df, ['email not open desktop', 'email not update'],
['email morning kind', 'email not work'], 'email outlook not work')
temp_df = ReplaceRedundantData(temp_df, [],
['engineering tool loin', 'engineering tool not open', 'engineering tool not work', 'engineering tool page not open'],
'engineering tool not open work')
temp_df = ReplaceRedundantData(temp_df, ['mobile device activation', 'need activate new'], [], 'mobile device activation request new phone company personal')
temp_df = ReplaceRedundantData(temp_df, ['ms crm dynamics', 'ms dynamics'], ['ms outlook issue crm dynamics', 'outlook issue ms crm dynamics'], 'ms crm dynamics outlook issue error')
temp_df = ReplaceRedundantData(temp_df, ['ms office installation'], [], 'ms office installation')
temp_df = ReplaceRedundantData(temp_df, ['need change password'], ['need help password reset', 'need password reset'], 'need help password change reset')
temp_df = ReplaceRedundantData(temp_df, ['need help change password management', 'need help reset password management'], [], 'need help change reset password management')
temp_df = ReplaceRedundantData(temp_df, ['outlook freeze', 'outlook not respond crm'],
['outlook freezes freeze', 'outlook freezing issue', 'outlook hangs not open', 'outlook issue not load'], 'outlook freeze crm hangs not respond')
temp_df = ReplaceRedundantData(temp_df, ['outlook not connect'],
['outlook not function', 'outlook not launch', 'outlook not open', 'outlook not load', 'outlook not start'], 'outlook not open load connect')
temp_df = ReplaceRedundantData(temp_df, ['password activate', 'pass word reset', 'reset user password'],
['password reset expire', 'reset password account', 'reset password user', 'reset password userid'], 'password activate reset expire')
temp_df = ReplaceRedundantData(temp_df, ['password management tool manager'], [], 'password management tool manager unable reset help query')
temp_df = ReplaceRedundantData(temp_df, ['reset microsoft online services password'], [], 'request reset microsoft online services password')
temp_df = ReplaceRedundantData(temp_df, ['request erp password'], [], 'request erp password')
temp_df = ReplaceRedundantData(temp_df, ['request erp user'], [], 'request erp user account password')
temp_df = ReplaceRedundantData(temp_df, ['request passoword erp qa'], [], 'request passoword erp qa hcm login')
temp_df = ReplaceRedundantData(temp_df, ['reset passoword window', 'reset windows password'], [], 'request passoword windows login unlock')
temp_df = ReplaceRedundantData(temp_df, ['supply chain software'], [], 'supply chain software password reset unable access')
temp_df = ReplaceRedundantData(temp_df, ['survey related ticket'], [], 'survey related ticket not work')
temp_df = ReplaceRedundantData(temp_df, ['unable access email'], [], 'unable access email outlook')
temp_df = ReplaceRedundantData(temp_df, ['urgent help require outlook crm mfg'], [], 'urgent help require outlook crm mfg issue')
temp_df = ReplaceRedundantData(temp_df, ['windows account'], [], 'windows account unlock reset password')
temp_df = ReplaceRedundantData(temp_df, ['windows password'], [], 'windows password expire reset request management tool')
# Drop Duplicate rows from data
temp_df = temp_df.drop_duplicates([Description, AssignmentGroup])
print('Shape of GRP_0 data', temp_df.shape)
df = pd.concat([temp_df, df[df[AssignmentGroup] != 'GRP_0']])
df.reset_index(inplace=True, drop=True)
print('Shape of complete data', df.shape)
df["Processed Description"] = df[Description].apply(lambda x: RemoveStopWords(x))
# Copying the data frame with Lemmatized Description
processed_df = df.copy()
df.to_excel('Final-Stop-Word-Removed-Data.xlsx', index=False)
# Pick required column & Rename to Description for further processing
df = df[['Processed Description', AssignmentGroup]]
df.rename(columns={"Processed Description": Description}, inplace=True)
processed_df.head(25)
df = df[df[Description].str.split().str.len() > 2]
df.shape
duplicate_df = df[df.duplicated()]
duplicate_df[AssignmentGroup].value_counts()
df = df.drop_duplicates([Description, AssignmentGroup])
print("Data Shape after dropping Duplicate Values:", df.shape)
df = df.dropna(how='any',axis=0)
print("Data Shape after dropping Missing Values:", df.shape)
df.isnull().sum()
ShowWordCloud(df[Description])
MultiClassDataDistribution(df[AssignmentGroup])
# Final Data after Text Pre-Processing
final_df = df.copy()
X = df[Description]
y = df[AssignmentGroup]
df.info()
final_df.to_excel('Final-Processed-Data.xlsx', index=False)
%cp /content/Final-Processed-Data.xlsx /content/drive/My\ Drive/GL/Capstone
Text Pre-Processing task is completed. Now let us start with Data Resampling to deal with the imbalance in the dataset
We are going to perform 2 steps to logically split the data for further processing
So, now we have 2 separate datasets on which the model will execute and the prediction will be done.
The Hypertuned Model will evaluate the first dataset i.e. GRP_0 vs Others and predict the output.
df_others = df[df[AssignmentGroup] != 'GRP_0']
print("Other Groups Dataset Shape:", df_others.shape)
IsGroup0 = 'Is_GRP_0'
df[IsGroup0] = df[AssignmentGroup].apply(lambda x: 'GRP_0' if x == 'GRP_0' else 'Others')
df.info()
MultiClassDataDistribution(df[IsGroup0])
df_group0 = df[df[IsGroup0] == 'GRP_0']
ShowWordCloud(df_group0[Description])
MultiClassDataDistribution(df_others[AssignmentGroup])
ShowWordCloud(df_others[Description])
The Logical Split has facilitated us with segregation of data but still the data is imbalanced. The second dataset i.e. Other groups one is very skewed and we need to perform resampling of the data. We are going to perform the following steps
from sklearn.utils import resample
def ResampleDataset(ds, size, col):
    """Oversample every class of ds[col] (with replacement) to *size* rows each.

    ds   -- source DataFrame.
    size -- target row count per class.
    col  -- label column whose classes are balanced.
    Returns a new DataFrame with a fresh RangeIndex; RANDOM_STATE makes the
    resampling reproducible.
    """
    # DataFrame.append was removed in pandas 2.0; collect the per-class
    # samples and concatenate once instead.
    sampled_groups = []
    for grp in ds[col].unique():
        grp_df = ds[ds[col] == grp]
        sampled_groups.append(resample(grp_df, replace=True, n_samples=int(size),
                                       random_state=RANDOM_STATE))
    ds_sampled = pd.concat(sampled_groups)
    ds_sampled.reset_index(drop=True, inplace=True)
    return ds_sampled
# Over Sampling the Other groups dataset
size = df_others[AssignmentGroup].value_counts().max()
df_other_sampled = ResampleDataset(df_others, size, AssignmentGroup)
# Over Sampling the GRP_0 dataset
size = df[IsGroup0].value_counts().max()
df = ResampleDataset(df, size, IsGroup0)
MultiClassDataDistribution(df_other_sampled[AssignmentGroup])
In the other group dataset, GRP_8 has the maximum count, we will over sample the remaining groups to this count.
MultiClassDataDistribution(df[IsGroup0])
# Final Data after adding derived column Is_GRP_0
# This data wil be used to identify whether Assignment group is GRP_0 or Others
final_df_group0 = df.copy()
X = df[Description]
y = df[IsGroup0]
df.info()
final_df_group0.to_excel('Group-0-Data.xlsx', index=False)
# Final Data of Other GRP after removing GRP_0
final_df_others = df_other_sampled.copy()
Xs = df_other_sampled[Description]
ys = df_other_sampled[AssignmentGroup]
df_other_sampled.info()
final_df_others.to_excel('Group-Others-Data.xlsx', index=False)
We will run our models on both the original and the sampled dataset and try to find out which one gives better performance.
from sklearn.model_selection import train_test_split
def SplitData(feature, label, size):
    """Stratified, shuffled train/test split; prints the resulting shapes.

    size -- fraction of the data assigned to the test set.
    NOTE(review): no random_state is passed, so the split differs on every
    run — confirm whether reproducibility is wanted here.
    """
    xtr, xtt, ytr, ytt = train_test_split(feature, label, test_size=size,
                                          stratify=label, shuffle=True)
    print("Training Data Shape (X)", xtr.shape)
    print("Training Data Shape (y)", ytr.shape)
    print("Testing Data Shape (X)", xtt.shape)
    print("Testing Data Shape (y)", ytt.shape)
    return xtr, xtt, ytr, ytt
def SplitDataset(feature, label, validationSetRequired = False):
    """70/30 train-test split; optionally carve a further 20% validation set
    out of the training portion.

    Returns 4 arrays (train/test) or 6 arrays (train/test/validation).
    """
    xtr, xtt, ytr, ytt = SplitData(feature, label, 0.3)
    if not validationSetRequired:
        return xtr, xtt, ytr, ytt
    print("\nPost Validation Split ->")
    xtr, xvl, ytr, yvl = SplitData(xtr, ytr, 0.2)
    return xtr, xtt, ytr, ytt, xvl, yvl
In order to avoid variations in the distributions while splitting Training & Testing dataset we have performed Stratification. This ensures that the datasets are not skewed and there is proper distribution of classes, in our case it is the Assignment Groups
# Group 0 Dataset
print("Group 0 Data Split ->")
X_train, X_test, y_train, y_test = SplitDataset(X, y, validationSetRequired = False)
# Other Group Dataset
# Bug fix: the header used "\O" (a literal backslash-O, not an escape) where a
# newline was clearly intended, printing "\Other Group Data Split ->".
print("\nOther Group Data Split ->")
Xs_train, Xs_test, ys_train, ys_test = SplitDataset(Xs, ys, validationSetRequired = False)
# Visualize class balance of each split to confirm stratification worked
print("Class Distribution of the Training set from Group 0 Data")
MultiClassDataDistribution(y_train)
print("Class Distribution of the Training set from Other Group Data")
MultiClassDataDistribution(ys_train)
print("Class Distribution of the Testing set from Group 0 Data")
MultiClassDataDistribution(y_test)
print("Class Distribution of the Testing set from Other Group Data")
MultiClassDataDistribution(ys_test)
The training and testing class distribution of the sampled dataset looks more uniform than the original dataset.
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
def TdIdfVectorizer(train, test, fname):
    """Fit a 1-3 gram TF-IDF vectorizer on `train`, transform both splits and
    persist the fitted vectorizer (pickle locally, torch.save to Drive).

    Returns (train_matrix, test_matrix, vocabulary_size).
    """
    vectorizer = TfidfVectorizer(ngram_range=(1,3))
    vectorizer.fit(train)
    # get_feature_names() was removed in scikit-learn 1.2; the vocabulary size
    # is the same count, so read it directly from the fitted vocabulary.
    num_features = len(vectorizer.vocabulary_)
    print("Number of Features:", num_features)
    tr = vectorizer.transform(train)
    tt = vectorizer.transform(test)
    wrds = num_features
    print("Training Data Shape:", tr.shape)
    print("Testing Data Shape:", tt.shape)
    print("Total number of words in the vocabulary is", wrds)
    filename = fname + '-vectorizer.pkl'
    # `with` closes the file handle the original pickle.dump(open(...)) leaked
    with open(filename, 'wb') as f:
        pickle.dump(vectorizer, f)
    torch.save(vectorizer, filePath + filename)
    print("Vectorizer is saved as {0} \n".format(filename))
    return tr, tt, wrds
# Vectorize both datasets with TF-IDF; the fitted vectorizers are persisted
# inside TdIdfVectorizer for later inference.
print("Vectorization details of Group 0 Dataset ->")
Xtrain_tfidf, Xtest_tfidf, WORDS = TdIdfVectorizer(X_train, X_test, 'grp0')
print("\nVectorization details of Other Group Dataset ->")
Xstrain_tfidf, Xstest_tfidf, WORDS_SAMPLED = TdIdfVectorizer(Xs_train, Xs_test, 'others')
Need to encode the Target column classes i.e. Assignment Groups for model building
from sklearn.preprocessing import LabelEncoder
import pickle
def LabelEncoding(train, test, fname):
    """Fit a LabelEncoder on the training labels, encode both splits and
    persist the fitted encoder (pickle locally, torch.save to Drive).

    Returns (encoded_train, encoded_test).
    """
    encoder = LabelEncoder().fit(train)
    encoded_train = encoder.transform(train)
    encoded_test = encoder.transform(test)
    filename = fname + '-encoder.pkl'
    pickle.dump(encoder, open(filename, 'wb'))
    torch.save(encoder, filePath + filename)
    print("Label Encoder is saved as {0} \n".format(filename))
    return encoded_train, encoded_test
# Encode targets for both datasets; fitted encoders are saved for inference
ytrain_enc, ytest_enc = LabelEncoding(y_train, y_test, 'grp0')
ystrain_enc, ystest_enc = LabelEncoding(ys_train, ys_test, 'others')
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from prettytable import PrettyTable
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score, LeaveOneOut
import time
import pickle
# Results table for the classical ML models: one row per model, one column
# per metric; every cell is filled in by CaptureScores.
ML_METRIC_COLUMNS = ['Precision Score', 'Recall Score', 'Training Score',
                     'Testing Score', 'KFold CV', 'F1 Score', 'Accuracy Score',
                     'Execution Time']
ML_MODEL_NAMES = ['Logistic Regression', 'K-Nearest Neighbors',
                  'Support Vector Machine', 'Naive Bayes', 'Decision Trees',
                  'Random Forest', 'Bagging', 'Gradient Boosting', 'XG Boosting']
df_ML_results = pd.DataFrame(columns=ML_METRIC_COLUMNS, index=ML_MODEL_NAMES)
def KFoldCrossValidation(model):
    """5-fold stratified CV accuracy (in percent) of `model` on the global
    training split (Xtrain / ytrain)."""
    splitter = StratifiedKFold(shuffle=True, random_state=RANDOM_STATE, n_splits=5)
    fold_scores = cross_val_score(model, Xtrain, ytrain, cv=splitter, scoring='accuracy')
    mean_accuracy = fold_scores.mean() * 100
    print("Cross Validation Test Accuracy:", mean_accuracy)
    return mean_accuracy
def LOOCrossValidation(model):
    """Leave-one-out CV accuracy (in percent) of `model` on the global
    training split. Very expensive; not called in this section."""
    scores = cross_val_score(model, Xtrain, ytrain, cv=LeaveOneOut(), scoring='accuracy')
    return scores.mean() * 100
def SearchCV(cvtype, mdl, prms):
    """Run hyper-parameter search with 5-fold stratified CV on the global
    training split.

    cvtype : 'grid' for exhaustive GridSearchCV, 'random' for
             RandomizedSearchCV (30 iterations).
    Returns (best_score, best_params).
    Raises ValueError for an unknown cvtype (previously this fell through to
    an UnboundLocalError on `cv.fit`).
    """
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    if cvtype == 'grid':
        cv = GridSearchCV(mdl, prms, n_jobs = -1, verbose=3, cv=kfold)
    elif cvtype == 'random':
        cv = RandomizedSearchCV(estimator = mdl, param_distributions = prms, verbose=3, n_jobs = -1, n_iter = 30, cv=kfold)
    else:
        raise ValueError("cvtype must be 'grid' or 'random', got {0!r}".format(cvtype))
    cv.fit(Xtrain, ytrain)
    return cv.best_score_, cv.best_params_
# Wall-clock anchor; CaptureScores reports time elapsed since this point.
START_TIME = time.time()
def FitModel(mdl):
    """Fit `mdl` on the global training split and print train/test accuracy."""
    mdl.fit(Xtrain, ytrain)
    score_train = mdl.score(Xtrain, ytrain)
    score_test = mdl.score(Xtest, ytest)
    print("Training Data Score: {0}".format(score_train))
    print("Testing Data Score : {0}".format(score_test))
#Function to execute the model passed as an argument
def ExecuteModel(model, tuningFunction = None):
    """Train `model` on the global splits.

    Without a tuning function: run 5-fold CV, time and fit the model, and
    return (model, cv_accuracy_percent). With one: fit a baseline first,
    then return (tuned_model, best_score_percent) from the tuner.
    """
    global START_TIME
    if tuningFunction is None:
        cv_accuracy = KFoldCrossValidation(model)
        # time only the final fit, not the CV runs
        START_TIME = time.time()
        FitModel(model)
        return model, cv_accuracy
    # baseline fit before hyper-parameter tuning
    FitModel(model)
    print("--------------------Hyper Paramater tuning started--------------------")
    tuned_model, best_score = tuningFunction(model)
    return tuned_model, best_score * 100
#Function to print the Final Results of the model
def CaptureScores(model, kmean, label, fname):
    """Score `model` on the global test split, record every metric in the
    df_ML_results row `label`, print a summary table and persist the model.

    kmean : CV accuracy (or best tuning score) already in percent.
    fname : file-name stem for the pickle / torch.save artifacts.
    """
    pred = model.predict(Xtest)
    # All metrics reported as percentages; weighted averaging accounts for
    # class imbalance in the multi-class case.
    acc = accuracy_score(ytest, pred) * 100
    pr = precision_score(ytest, pred, average='weighted') * 100
    rl = recall_score(ytest, pred, average='weighted') * 100
    f1 = f1_score(ytest, pred, average='weighted') * 100
    tr = model.score(Xtrain, ytrain) * 100
    tt = model.score(Xtest, ytest) * 100
    # Elapsed time since the global START_TIME set just before fitting
    endtime = (time.time() - START_TIME)
    df_ML_results.at[label, 'Accuracy Score'] = acc
    df_ML_results.at[label, 'Precision Score'] = pr
    df_ML_results.at[label, 'Recall Score'] = rl
    df_ML_results.at[label, 'F1 Score'] = f1
    df_ML_results.at[label, 'Training Score'] = tr
    df_ML_results.at[label, 'Testing Score'] = tt
    df_ML_results.at[label, 'KFold CV'] = kmean
    # df_ML_results.at[label, 'LeaveOneOut CV'] = lmean
    df_ML_results.at[label, 'Execution Time'] = endtime
    x = PrettyTable()
    x.field_names = ["Metrics", "Results"]
    x.add_row(["Accuracy Score", acc])
    x.add_row(["Precision Score", pr])
    x.add_row(["Recall Score", rl])
    x.add_row(["F1 Score", f1])
    x.add_row(["Training Score", tr])
    x.add_row(["Testing Score", tt])
    x.add_row(["KFold CV", kmean])
    # x.add_row(["LeaveOneOut CV", lmean])
    x.add_row(["Execution Time", endtime])
    print(x)
    # Save the model both locally (pickle) and to Drive (torch.save)
    filename = fname + '.pkl'
    pickle.dump(model, open(filename, 'wb'))
    torch.save(model, filePath + filename)
    print("Model is saved as {0} \n".format(filename))
def ExecuteAllMLModels(models):
    """Run every (name, filename, estimator, tuner) tuple through ExecuteModel
    and record its scores via CaptureScores."""
    for name, filename, model, hyptune in models:
        print("{0} - Model Execution Started ------------------------------------------------------------\n".format(name))
        fitted_model, score = ExecuteModel(model, hyptune)
        CaptureScores(fitted_model, score, name, filename)
def LogisticRegressionTuneHyperParams(lr):
    """Randomized search over LogisticRegression hyper-parameters, then refit
    the best configuration on the training split.

    Returns (fitted_model, best_cv_score).
    """
    # Hyper Parameter
    solvers = ['newton-cg', 'lbfgs']
    # Bug fix: newton-cg / lbfgs support only the 'l2' penalty; the original
    # grid also listed 'l1', making half the search candidates fail to fit.
    penalty = ['l2']
    # Create regularization hyperparameter space
    C = [0.1, 1, 10, 100, 1000]
    # Create hyperparameter options
    param = dict(C=C, penalty=penalty, solver=solvers)
    lr_bestScore, lr_bestParam = SearchCV('random', lr, param)
    print("Best Accuracy after Tuning: {0}".format(lr_bestScore))
    print("Best Parameter after Hyper Tuning: {0}".format(lr_bestParam))
    print("--------------------Hyper Paramater tuning complete--------------------")
    #Creating new model with best Parameters and running on the data again
    global START_TIME
    START_TIME = time.time()
    lr = LogisticRegression(C=lr_bestParam['C'],
                            penalty=lr_bestParam['penalty'],
                            solver=lr_bestParam['solver'])
    lr.fit(Xtrain, ytrain)
    return lr, lr_bestScore
def KNNTuneHyperParams(knn):
    """Randomized search over KNN hyper-parameters, then refit the best
    configuration on the training split. Returns (fitted_model, best_cv_score)."""
    knn = KNeighborsClassifier()
    search_space = dict(n_neighbors=range(5, 21, 2),
                        weights=['uniform', 'distance'],
                        metric=['euclidean', 'manhattan', 'minkowski'])
    knn_bestScore, knn_bestParam = SearchCV('random', knn, search_space)
    print("Best Accuracy after Tuning: {0}".format(knn_bestScore))
    print("Best Parameter after Hyper Tuning: {0}".format(knn_bestParam))
    print("--------------------Hyper Paramater tuning complete--------------------")
    #Creating new model with best Parameters and running on the data again
    global START_TIME
    START_TIME = time.time()
    knn = KNeighborsClassifier(n_neighbors=knn_bestParam['n_neighbors'],
                               weights=knn_bestParam['weights'],
                               metric=knn_bestParam['metric'])
    knn.fit(Xtrain, ytrain)
    return knn, knn_bestScore
def SVMTuneHyperParams(svc):
    """Randomized search over SVC kernel and C, then refit the best
    configuration (with probability estimates enabled).
    Returns (fitted_model, best_cv_score)."""
    svc = SVC()
    search_space = dict(kernel = ['linear', 'poly', 'rbf'],
                        C = [10, 1.0, 0.1, 100])
    svc_bestScore, svc_bestParam = SearchCV('random', svc, search_space)
    print("Best Accuracy after Tuning: {0}".format(svc_bestScore))
    print("Best Parameter after Hyper Tuning: {0}".format(svc_bestParam))
    print("--------------------Hyper Paramater tuning complete--------------------")
    #Creating new model with best Parameters and running on the data again
    global START_TIME
    START_TIME = time.time()
    svc = SVC(kernel = svc_bestParam['kernel'], C=svc_bestParam['C'], probability=True)
    svc.fit(Xtrain, ytrain)
    return svc, svc_bestScore
def MultiNBTuneHyperParams(mnb):
    """Randomized search over MultinomialNB smoothing and prior fitting, then
    refit the best configuration. Returns (fitted_model, best_cv_score)."""
    mnb = MultinomialNB()
    search_space = dict(alpha = [0, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 1, 1.5, 2],
                        fit_prior = [True, False])
    mnb_bestScore, mnb_bestParam = SearchCV('random', mnb, search_space)
    print("Best Accuracy after Tuning: {0}".format(mnb_bestScore))
    print("Best Parameter after Hyper Tuning: {0}".format(mnb_bestParam))
    print("--------------------Hyper Paramater tuning complete--------------------")
    #Creating new model with best Parameters and running on the data again
    global START_TIME
    START_TIME = time.time()
    mnb = MultinomialNB(alpha = mnb_bestParam['alpha'],
                        fit_prior = mnb_bestParam['fit_prior'])
    mnb.fit(Xtrain, ytrain)
    return mnb, mnb_bestScore
def DecisionTreesTuneHyperParams(dt):
    """Randomized search over DecisionTree split criteria, then refit the best
    configuration. Returns (fitted_model, best_cv_score)."""
    dt = DecisionTreeClassifier()
    search_space = dict(criterion = ['gini', 'entropy'],
                        min_samples_split = [2, 5, 8, 10, 15],
                        max_features = [None, 'sqrt', 'log2'])
    dt_bestScore, dt_bestParam = SearchCV('random', dt, search_space)
    print("Best Accuracy after Tuning: {0}".format(dt_bestScore))
    print("Best Parameter after Hyper Tuning: {0}".format(dt_bestParam))
    print("--------------------Hyper Paramater tuning complete--------------------")
    #Creating new model with best Parameters and running on the data again
    global START_TIME
    START_TIME = time.time()
    dt = DecisionTreeClassifier(criterion = dt_bestParam['criterion'],
                                min_samples_split = dt_bestParam['min_samples_split'],
                                max_features = dt_bestParam['max_features'])
    dt.fit(Xtrain, ytrain)
    return dt, dt_bestScore
def RandomForestTuneHyperParams(rf):
    """Randomized search over RandomForest size and split criteria, then refit
    the best configuration. Returns (fitted_model, best_cv_score)."""
    rf = RandomForestClassifier()
    search_space = dict(n_estimators = [1000, 1500, 2000],
                        min_samples_split = [2, 5, 15],
                        criterion = ['gini', 'entropy'],
                        max_features = ['sqrt', 'log2'])
    rf_bestScore, rf_bestParam = SearchCV('random', rf, search_space)
    print("Best Accuracy after Tuning: {0}".format(rf_bestScore))
    print("Best Parameter after Hyper Tuning: {0}".format(rf_bestParam))
    print("--------------------Hyper Paramater tuning complete--------------------")
    #Creating new model with best Parameters and running on the data again
    global START_TIME
    START_TIME = time.time()
    rf = RandomForestClassifier(n_estimators = rf_bestParam['n_estimators'],
                                min_samples_split = rf_bestParam['min_samples_split'],
                                criterion = rf_bestParam['criterion'],
                                max_features = rf_bestParam['max_features'])
    rf.fit(Xtrain, ytrain)
    return rf, rf_bestScore
def BaggingTuneHyperParams(bg):
    """Randomized search over the Bagging ensemble size, then refit the best
    configuration (parallelised). Returns (fitted_model, best_cv_score)."""
    bg = BaggingClassifier()
    search_space = dict(n_estimators = [10, 100, 500, 1000, 1500])
    bg_bestScore, bg_bestParam = SearchCV('random', bg, search_space)
    print("Best Accuracy after Tuning: {0}".format(bg_bestScore))
    print("Best Parameter after Hyper Tuning: {0}".format(bg_bestParam))
    print("--------------------Hyper Paramater tuning complete--------------------")
    #Creating new model with best Parameters and running on the data again
    global START_TIME
    START_TIME = time.time()
    bg = BaggingClassifier(n_estimators = bg_bestParam['n_estimators'], n_jobs = -1)
    bg.fit(Xtrain, ytrain)
    return bg, bg_bestScore
def GradientBoostingTuneHyperParams(gb):
    """Randomized search over GradientBoosting hyper-parameters, then refit
    the best configuration. Returns (fitted_model, best_cv_score)."""
    gb = GradientBoostingClassifier()
    est = [10, 100, 500, 1000]
    rate = [0.001, 0.01, 0.1]
    minSplit = [2, 5]
    # Bug fix: `maxDepth` was referenced in the param dict but never defined,
    # raising NameError as soon as this function ran. Use a small sensible
    # range around the library default (3).
    maxDepth = [3, 5, 8]
    param = dict(n_estimators = est, learning_rate = rate, max_depth = maxDepth, min_samples_split = minSplit)
    gb_bestScore, gb_bestParam = SearchCV('random', gb, param)
    print("Best Accuracy after Tuning: {0}".format(gb_bestScore))
    print("Best Parameter after Hyper Tuning: {0}".format(gb_bestParam))
    print("--------------------Hyper Paramater tuning complete--------------------")
    #Creating new model with best Parameters and running on the data again
    global START_TIME
    START_TIME = time.time()
    e = gb_bestParam['n_estimators']
    lr = gb_bestParam['learning_rate']
    md = gb_bestParam['max_depth']
    mss = gb_bestParam['min_samples_split']
    gb = GradientBoostingClassifier(n_estimators = e, learning_rate = lr, max_depth = md, min_samples_split = mss)
    gb.fit(Xtrain, ytrain)
    return gb, gb_bestScore
# Point the shared training globals at the Group-0 (binary) dataset
Xtrain = Xtrain_tfidf
Xtest = Xtest_tfidf
ytrain = ytrain_enc
ytest = ytest_enc
# (display name, save-file prefix, estimator, optional tuning function);
# models with a None tuner run with the fixed parameters shown.
models = []
models.append(('Logistic Regression', 'grp0-lr-model', LogisticRegression(), LogisticRegressionTuneHyperParams))
models.append(('K-Nearest Neighbors', 'grp0-knn-model', KNeighborsClassifier(), KNNTuneHyperParams))
models.append(('Support Vector Machine', 'grp0-svm-model', SVC(probability=True), SVMTuneHyperParams))
models.append(('Naive Bayes', 'grp0-mnb-model', MultinomialNB(), MultiNBTuneHyperParams))
models.append(('Decision Trees', 'grp0-dt-model', DecisionTreeClassifier(), DecisionTreesTuneHyperParams))
models.append(('Random Forest', 'grp0-rf-model', RandomForestClassifier(n_estimators=1000, min_samples_split=5, max_features='log2', criterion='entropy'), None))
models.append(('Bagging', 'grp0-bgg-model', BaggingClassifier(n_estimators= 100), None))
models.append(('Gradient Boosting', 'grp0-gb-model', GradientBoostingClassifier(n_estimators = 1000, learning_rate = 0.1, min_samples_split = 2), None))
models.append(('XG Boosting', 'grp0-xgb-model', XGBClassifier(), None))
ExecuteAllMLModels(models)
# Preserve the Group-0 results before the table is reused for the other dataset
df_ML_Group0 = df_ML_results.copy()
df_ML_Group0
# Point the shared training globals at the Other-groups (multi-class) dataset
Xtrain = Xstrain_tfidf
Xtest = Xstest_tfidf
ytrain = ystrain_enc
ytest = ystest_enc
# Fresh results table for this dataset
df_ML_results = pd.DataFrame(columns=['Precision Score', 'Recall Score', 'Training Score', 'Testing Score', 'KFold CV', \
'F1 Score', 'Accuracy Score', 'Execution Time'],
index=['Logistic Regression', 'K-Nearest Neighbors', 'Support Vector Machine', 'Naive Bayes', \
'Decision Trees', 'Random Forest', 'Bagging', 'Gradient Boosting', 'XG Boosting'])
# LR / KNN / SVM reuse fixed parameters here instead of re-tuning
models = []
models.append(('Logistic Regression', 'others-lr-model', LogisticRegression(solver='lbfgs', penalty='l2', C=100), None))
models.append(('K-Nearest Neighbors', 'others-knn-model', KNeighborsClassifier(weights='distance', n_neighbors=5, metric='euclidean'), None))
models.append(('Support Vector Machine', 'others-svm-model', SVC(kernel='linear', C=100, probability=True), None))
models.append(('Naive Bayes', 'others-mnb-model', MultinomialNB(), MultiNBTuneHyperParams))
models.append(('Decision Trees', 'others-dt-model', DecisionTreeClassifier(), DecisionTreesTuneHyperParams))
models.append(('Random Forest', 'others-rf-model', RandomForestClassifier(n_estimators=100, min_samples_split=5, max_features='sqrt', criterion='gini'), None))
models.append(('Bagging', 'others-bgg-model', BaggingClassifier(n_estimators= 100), None))
models.append(('Gradient Boosting', 'others-gb-model', GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, min_samples_split = 2), None))
models.append(('XG Boosting', 'others-xgb-model', XGBClassifier(), None))
ExecuteAllMLModels(models)
# Preserve this dataset's results
df_ML_Others = df_ML_results.copy()
df_ML_Others
# Re-split both datasets, this time carving out a validation set for the
# deep-learning models.
# Original Dataset
print("Original Data Split ->")
X_train, X_test, y_train, y_test, X_val, y_val = SplitDataset(X, y, validationSetRequired = True)
# Sampled Dataset
print("\nSampled Data Split ->")
Xs_train, Xs_test, ys_train, ys_test, Xs_val, ys_val = SplitDataset(Xs, ys, validationSetRequired = True)
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text
# Vectorization parameters
# Limit on the number of features.
MAX_FEATURES = 10000
# Limit on the length of text sequences. Sequences longer than this will be truncated.
MAX_SEQUENCE_LENGTH = 300
# Size of the Embedding Vector (must match the 300-d GloVe file loaded later)
EMBEDDING_SIZE = 300
def sequence_vectorize(train_texts, test_texts, val_texts):
    """Tokenize on the training texts and convert all three splits into padded
    integer sequences.

    Returns (x_train, x_test, x_val, word_index, num_words).
    """
    # Vocabulary is learned from the training split only
    tokenizer = text.Tokenizer(num_words=MAX_FEATURES, oov_token='<UNK>')
    tokenizer.fit_on_texts(train_texts)
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_test = tokenizer.texts_to_sequences(test_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)
    # Pad/truncate everything to the longest training sequence, capped at
    # MAX_SEQUENCE_LENGTH; shorter sequences are left-padded, longer ones
    # truncated at the beginning (pad_sequences defaults).
    max_length = min(len(max(x_train, key=len)), MAX_SEQUENCE_LENGTH)
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_test = sequence.pad_sequences(x_test, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    # +1 leaves index 0 free for padding
    num_words = len(tokenizer.word_index) + 1
    print("Training Data Shape:", x_train.shape)
    print("Testing Data Shape:", x_test.shape)
    print("Validation Data Shape:", x_val.shape)
    print("Total number of words in the vocabulary is", num_words)
    return x_train, x_test, x_val, tokenizer.word_index, num_words
# Convert both datasets to padded integer sequences for the DL models
print("Group 0 Dataset Attributes ->")
X_train_seq, X_test_seq, X_val_seq, Word_Index, WORDS = sequence_vectorize(X_train, X_test, X_val)
print("\nOther Group Dataset Attributes ->")
Xs_train_seq, Xs_test_seq, Xs_val_seq, Word_Index_Sampled, WORDS_SAMPLED = sequence_vectorize(Xs_train, Xs_test, Xs_val)
Need to encode the Target column classes i.e. Assignment Groups for model building
from sklearn.preprocessing import LabelEncoder
def LabelEncoding(train, test, val):
    """Fit a LabelEncoder on the training labels and encode all three splits.

    NOTE: this redefines the earlier two-split LabelEncoding; this variant
    does not persist the encoder. Returns (train_enc, test_enc, val_enc).
    """
    encoder = LabelEncoder().fit(train)
    return encoder.transform(train), encoder.transform(test), encoder.transform(val)
# Encode targets (train/test/val) for both datasets
ytrain_enc, ytest_enc, yval_enc = LabelEncoding(y_train, y_test, y_val)
ystrain_enc, ystest_enc, ysval_enc = LabelEncoding(ys_train, ys_test, ys_val)
EMBEDDING_FILE = datasetPath + 'glove.6B.300d.txt'
# word -> 300-d GloVe vector, loaded once for both datasets
embeddings = {}
# `with` closes the (large) embedding file; the original `for o in open(...)`
# leaked the handle. Each line is "<word> <v1> ... <v300>".
with open(EMBEDDING_FILE) as glove_file:
    for line in glove_file:
        values = line.split(" ")
        embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
# create a weight matrix for words
def get_embedding_matrix(num_words, word_index):
    """Build a (num_words x EMBEDDING_SIZE) weight matrix whose row i is the
    GloVe vector of the word with tokenizer index i; rows for words without a
    GloVe entry remain zero."""
    weight_matrix = np.zeros((num_words, EMBEDDING_SIZE))
    for token, idx in word_index.items():
        vector = embeddings.get(token)
        if vector is not None:
            weight_matrix[idx] = vector
    return weight_matrix
import tensorflow as tf
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from prettytable import PrettyTable
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D, GRU, Conv1D, MaxPooling1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import regularizers, optimizers
import operator
import time
# Result tables for the deep-learning models: per-architecture training
# metrics, per-architecture test scores, and a flat accuracy log per run.
DL_MODEL_INDEX = ['LSTM', 'Bi-LSTM', 'GRU']
df_DL_results = pd.DataFrame(
    columns=['Training Loss', 'Training Accuracy', 'Val. Loss', 'Val. Accuracy',
             'Testing Loss', 'Testing Accuracy'],
    index=DL_MODEL_INDEX)
df_DL_Scores = pd.DataFrame(
    columns=['Precision Score', 'Recall Score', 'F1 Score', 'Accuracy Score',
             'Execution Time'],
    index=DL_MODEL_INDEX)
DL_results = pd.DataFrame(columns=['Name', 'Accuracy'])
# Placeholders; set per-dataset by DetermineParameters()
NUM_CLASSES = 0
ACTIVATION = ''
LOSS = ''
def DetermineParameters():
    """Pick the output-layer size, activation and loss from the number of
    distinct classes in the global `ytrain` (binary vs multi-class)."""
    class_count = pd.unique(ytrain).shape[0]
    if class_count == 2:
        # binary: single sigmoid unit with binary cross-entropy
        return 1, 'sigmoid', 'binary_crossentropy'
    # multi-class: one softmax unit per class, integer-label cross-entropy
    return class_count, 'softmax', 'sparse_categorical_crossentropy'
def ExecuteDLModel(batch_size, nodes, epoch, model_method, label):
    """Build a network via `model_method` with the GloVe embedding matrix and
    train it on the global splits. Returns (model, history_dict)."""
    global START_TIME
    START_TIME = time.time()
    embedding_matrix = get_embedding_matrix(TOTAL_WORDS, WORD_INDEX)
    network = model_method(nodes, TOTAL_WORDS, embedding_matrix, Xtrain.shape[1])
    return FitModel(network, batch_size, epoch, label)
def CompileModel(m):
    """Compile `m` with Adam (lr=0.01), the globally selected LOSS and an
    accuracy metric; returns the compiled model."""
    m.compile(loss=LOSS,
              optimizer=optimizers.Adam(learning_rate=0.01),
              metrics=['accuracy'])
    return m
def FitModel(m, batch_size, epoch, label):
    """Compile and train `m` with early stopping, LR reduction and best-weight
    checkpointing, then plot the learning curves.

    NOTE: redefines the earlier single-argument ML FitModel.
    Returns (model, history_dict).
    """
    m = CompileModel(m)
    # stop if validation accuracy stalls for 15 epochs
    early_stop = EarlyStopping(monitor='val_accuracy', patience=15)
    # shrink the learning rate by 0.8 after 3 stagnant epochs
    reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.8, patience=3)
    # keep only the weights with the best validation accuracy
    filepath = 'DL_' + label + "_weights.best.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
    history = m.fit(Xtrain, ytrain, epochs=epoch, validation_data=(Xval, yval),
                    verbose = 1, batch_size = batch_size,
                    callbacks = [early_stop, reduce_lr, checkpoint])
    PlotTrainingResults(history.history)
    return m, history.history
def CaptureResults(mdl, batch_size, label, newlabel):
    """Evaluate the checkpointed best weights on the test split and record the
    accuracy (in percent) as a new row of the global DL_results table."""
    global DL_results
    # Loading the best weight of the model for evaluation
    mdl.load_weights('DL_' + label + "_weights.best.hdf5")
    mdl = CompileModel(mdl)
    # Evaluate the model on Test Data
    loss, test_acc = mdl.evaluate(Xtest, ytest, verbose = 2, batch_size = batch_size)
    # DataFrame.append was removed in pandas 2.0 -- use concat instead
    new_row = pd.DataFrame([{'Name': newlabel, 'Accuracy': test_acc * 100}])
    DL_results = pd.concat([DL_results, new_row], ignore_index=True)
def CaptureMetrics(mdl, res, batch_size, label):
    """Evaluate the checkpointed best weights on the test split, print a
    metric summary and record the run in df_DL_results / df_DL_Scores.

    res : the Keras training history dict for this run.
    """
    # Loading the best weight of the model for evaluation
    mdl.load_weights('DL_' + label + "_weights.best.hdf5")
    mdl = CompileModel(mdl)
    # Evaluate the model on Test Data
    loss, test_acc = mdl.evaluate(Xtest, ytest, verbose = 2, batch_size = batch_size)
    test_acc = test_acc * 100
    # Model Prediction
    pred_probs = mdl.predict(Xtest, verbose=0)
    # Bug fix: Sequential.predict_classes() was removed in TensorFlow 2.6;
    # derive the crisp classes from the predicted probabilities instead.
    if pred_probs.ndim > 1 and pred_probs.shape[-1] > 1:
        # softmax multi-class head
        pred_classes = np.argmax(pred_probs, axis=-1)
    else:
        # single sigmoid unit: threshold at 0.5
        pred_classes = (pred_probs.ravel() > 0.5).astype('int32')
    # reduce probabilities to 1d array (first output unit)
    pred_probs = pred_probs[:, 0]
    pred_acc = accuracy_score(ytest, pred_classes) * 100
    pre = precision_score(ytest, pred_classes, average='weighted') * 100
    rcl = recall_score(ytest, pred_classes, average='weighted') * 100
    f1 = f1_score(ytest, pred_classes, average='weighted') * 100
    endtime = (time.time() - START_TIME)
    x = PrettyTable()
    x.field_names = ["Metrics", "Results"]
    x.add_row(["Testing Loss", loss])
    x.add_row(["Testing Accuracy", test_acc])
    x.add_row(["Accuracy Score", pred_acc])
    x.add_row(["Precision Score", pre])
    x.add_row(["Recall Score", rcl])
    x.add_row(["F1 Score", f1])
    x.add_row(["Execution Time", endtime])
    print(x)
    # Store results from the epoch with the best validation accuracy
    idx, val_acc = max(enumerate(res['val_accuracy']), key=operator.itemgetter(1))
    df_DL_results.at[label, 'Training Loss'] = res['loss'][idx]
    df_DL_results.at[label, 'Training Accuracy'] = res['accuracy'][idx] * 100
    df_DL_results.at[label, 'Val. Loss'] = res['val_loss'][idx]
    df_DL_results.at[label, 'Val. Accuracy'] = val_acc * 100
    df_DL_results.at[label, 'Testing Loss'] = loss
    df_DL_results.at[label, 'Testing Accuracy'] = test_acc
    df_DL_Scores.at[label, 'Accuracy Score'] = pred_acc
    df_DL_Scores.at[label, 'Precision Score'] = pre
    df_DL_Scores.at[label, 'Recall Score'] = rcl
    df_DL_Scores.at[label, 'F1 Score'] = f1
    df_DL_Scores.at[label, 'Execution Time'] = endtime
def PlotTrainingResults(data):
    """Plot the train-vs-validation accuracy and loss curves side by side from
    a Keras history dict."""
    plt.figure(figsize=(15,5))
    panels = [
        (1, 'accuracy', 'val_accuracy', 'Model Accuracy', 'accuracy', 'upper left'),
        (2, 'loss', 'val_loss', 'Model Loss', 'loss', 'upper right'),
    ]
    for position, train_key, val_key, title, ylab, legend_loc in panels:
        plt.subplot(1, 2, position)
        plt.plot(data[train_key])
        plt.plot(data[val_key])
        plt.title(title)
        plt.ylabel(ylab)
        plt.xlabel('epoch')
        plt.legend(['train', 'validation'], loc=legend_loc)
    plt.show()
def LSTMModel(nodes, words, matrix, ip_len):
    """LSTM classifier over frozen GloVe embeddings.

    nodes  : LSTM units; words: vocabulary size; matrix: embedding weights;
    ip_len : padded input sequence length.
    """
    model = Sequential()
    model.add(Embedding(words, EMBEDDING_SIZE, input_length=ip_len, weights=[matrix], trainable=False))
    # Bug fix: return_sequences was the *string* 'false', which is truthy, so
    # the layer actually returned the full sequence (which the Flatten below
    # relies on). Make the actual behavior explicit with the boolean True.
    model.add(LSTM(nodes, return_sequences=True))
    model.add(Dropout(0.1))
    model.add(Flatten())
    model.add(Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dropout(0.1))
    model.add(Dense(NUM_CLASSES, activation = ACTIVATION))
    model.summary()
    return model
def BiDirectionalLSTMModel(nodes, words, matrix, ip_len):
    """Bidirectional LSTM classifier over frozen GloVe embeddings.

    nodes  : LSTM units per direction; words: vocabulary size;
    matrix : embedding weights; ip_len: padded input sequence length.
    """
    model = Sequential()
    model.add(Embedding(words, EMBEDDING_SIZE, input_length=ip_len, weights=[matrix], trainable=False))
    # Bug fix: return_sequences='false' is a truthy string, so the layer was
    # really returning sequences (required by Flatten); now explicit.
    model.add(Bidirectional(LSTM(nodes, return_sequences=True)))
    model.add(Dropout(0.1))
    model.add(Flatten())
    model.add(Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dropout(0.1))
    model.add(Dense(NUM_CLASSES, activation = ACTIVATION))
    model.summary()
    return model
def GRUModel(nodes, words, matrix, ip_len):
    """GRU classifier over frozen GloVe embeddings.

    nodes  : GRU units; words: vocabulary size; matrix: embedding weights;
    ip_len : padded input sequence length.
    """
    model = Sequential()
    model.add(Embedding(words, EMBEDDING_SIZE, input_length=ip_len, weights=[matrix], trainable=False))
    # Bug fix: return_sequences='false' is a truthy string, so the layer was
    # really returning sequences (required by Flatten); now explicit.
    model.add(GRU(nodes, return_sequences=True))
    model.add(Dropout(0.1))
    model.add(Flatten())
    model.add(Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dropout(0.1))
    model.add(Dense(NUM_CLASSES, activation = ACTIVATION))
    model.summary()
    return model
# Point the shared DL globals at the Group-0 (binary) dataset
Xtrain = X_train_seq
Xtest = X_test_seq
Xval = X_val_seq
ytrain = ytrain_enc
ytest = ytest_enc
yval = yval_enc
TOTAL_WORDS = WORDS
WORD_INDEX = Word_Index
EPOCH = 50
NUM_CLASSES, ACTIVATION, LOSS = DetermineParameters()
# Batch sizes and recurrent-unit counts to sweep
BATCH = [128]
NODES = [128]
def ExecuteAllDLModels():
    """Train and evaluate the LSTM, Bi-LSTM and GRU architectures for every
    batch-size / node-count combination, recording results after each run."""
    architectures = [('LSTM', LSTMModel),
                     ('Bi-LSTM', BiDirectionalLSTMModel),
                     ('GRU', GRUModel)]
    for batch in BATCH:
        for node in NODES:
            for label, builder in architectures:
                newlabel = label + "-Batch-" + str(batch) + "-Nodes-" + str(node)
                print("\n--------------------" + newlabel + "--------------------\n")
                # Bug fix: the original passed (node, batch, ...) although
                # ExecuteDLModel's signature is (batch_size, nodes, ...); the
                # swap was masked only because BATCH and NODES are both [128].
                model, data = ExecuteDLModel(batch, node, EPOCH, builder, label)
                CaptureResults(model, batch, label, newlabel)
                CaptureMetrics(model, data, batch, label)
    print(DL_results)
ExecuteAllDLModels()
# Preserve the Group-0 DL results before the tables are reused
df_DL_Group0 = df_DL_results.copy()
df_DL_Group0
df_DL_Scores_Group0 = df_DL_Scores.copy()
df_DL_Scores_Group0
# Point the shared DL globals at the Other-groups (multi-class) dataset
Xtrain = Xs_train_seq
Xtest = Xs_test_seq
Xval = Xs_val_seq
ytrain = ystrain_enc
ytest = ystest_enc
yval = ysval_enc
TOTAL_WORDS = WORDS_SAMPLED
WORD_INDEX = Word_Index_Sampled
EPOCH = 50
NUM_CLASSES, ACTIVATION, LOSS = DetermineParameters()
# Fresh result tables for this dataset
df_DL_results = pd.DataFrame(columns = ['Training Loss', 'Training Accuracy', 'Val. Loss', 'Val. Accuracy', \
'Testing Loss', 'Testing Accuracy'],
index=['LSTM', 'Bi-LSTM', 'GRU'])
df_DL_Scores = pd.DataFrame(columns = ['Precision Score', 'Recall Score', 'F1 Score', 'Accuracy Score', 'Execution Time'],
index=['LSTM', 'Bi-LSTM', 'GRU'])
DL_results = pd.DataFrame(columns = ['Name', 'Accuracy'])
BATCH = [128]
NODES = [128]
# Train and evaluate all DL architectures on this dataset
ExecuteAllDLModels()
# Preserve this dataset's DL results
df_DL_Others = df_DL_results.copy()
df_DL_Others
df_DL_Scores_Others = df_DL_Scores.copy()
df_DL_Scores_Others
def SetHorizontalBarValue(ax):
    """Annotate each horizontal bar in `ax` with its raw width (2 decimals)
    and invert the y-axis so the largest bar sits on top.

    Unlike SetHorizontalBarValues above, this writes absolute values, not
    percentages of the total.
    """
    for bar in ax.patches:
        # get_width positions the label just past the bar tip; get_y aligns it
        ax.text(bar.get_width()+0.01, bar.get_y()+.2, str(str(round(bar.get_width(), 2))))
    # invert for largest on top
    ax.invert_yaxis()
def PlotResults():
    """Draw two horizontal bar charts from the global df_Scores: model quality
    (Accuracy / F1) on the left and execution time on the right."""
    fig, axes = plt.subplots(1, 2, figsize=(21, 12))
    scores = df_Scores.reset_index()
    quality_ax = scores.plot(ax=axes[0], x="index",
                             y=['Accuracy Score', 'F1 Score'], kind="barh")
    SetHorizontalBarValue(quality_ax)
    axes[0].set_title("Model Performance")
    axes[0].set_xlabel("Metrics (%)")
    axes[0].set_ylabel("Models")
    axes[0].legend(loc='center right', bbox_to_anchor=(1.5, 0.5))
    time_ax = scores.plot(ax=axes[1], x="index",
                          y=['Execution Time'], kind="barh")
    SetHorizontalBarValue(time_ax)
    axes[1].set_title("Model Execution")
    axes[1].set_xlabel("Execution Time (seconds)")
    axes[1].set_ylabel("Models")
    plt.tight_layout()
# Combine ML and DL scores for the Group-0 dataset and plot the comparison
df_Scores = pd.concat([df_ML_Group0[['Accuracy Score', 'Precision Score', 'Recall Score', 'F1 Score', 'Execution Time']], df_DL_Scores_Group0])
df_Scores
PlotResults()
# Same comparison for the Other-groups dataset
df_Scores = pd.concat([df_ML_Others[['Accuracy Score', 'Precision Score', 'Recall Score', 'F1 Score', 'Execution Time']], df_DL_Scores_Others])
df_Scores
PlotResults()
import random
def GetRandomIntegers(limit):
    """Return `limit` (capped at 10) distinct random integers in [1, 19].

    The original implementation looped 50 times hoping to collect enough
    unique values and could, in principle, return fewer than `limit`;
    random.sample guarantees exactly `limit` distinct values.
    """
    limit = max(0, min(limit, 10))
    return random.sample(range(1, 20), limit)
def PredictAssignmentGroup(text, grp0_model, others_model):
    """Two-stage prediction for one ticket description.

    Stage 1: the binary model decides GRP_0 vs not. Stage 2: if not GRP_0,
    the multi-class model picks the concrete group. Returns the group name.
    """
    # `with` closes each pickle file; the original pickle.load(open(...))
    # calls leaked the handles.
    with open('grp0-vectorizer.pkl', 'rb') as f:
        grp0_vec = pickle.load(f)
    vec = grp0_vec.transform([text])
    pred_class = grp0_model.predict(vec)
    with open('grp0-encoder.pkl', 'rb') as f:
        grp0_enc = pickle.load(f)
    group = grp0_enc.inverse_transform(pred_class)[0]
    if group != 'GRP_0':
        with open('others-vectorizer.pkl', 'rb') as f:
            others_vec = pickle.load(f)
        vec = others_vec.transform([text])
        pred_class = others_model.predict(vec)
        with open('others-encoder.pkl', 'rb') as f:
            others_enc = pickle.load(f)
        group = others_enc.inverse_transform(pred_class)[0]
    return group
def GetPredictions(num, test_data, mdl1, mdl2):
    """Predict assignment groups for every 20th ticket starting at row `num`,
    save the results to an Excel file and return them as a DataFrame."""
    rows = []
    # Loop through the data and take one sample after every 20 step starting with random number num
    for i in range(num, test_data.shape[0], 20):
        grp = PredictAssignmentGroup(test_data[Description][i], mdl1, mdl2)
        rows.append({'Description' : test_data[Description][i],
                     'Assignment Group': test_data[AssignmentGroup][i],
                     'Predicted Group': grp})
    # DataFrame.append was removed in pandas 2.0 -- build the frame in one go
    predictions = pd.DataFrame(rows, columns = ['Description', 'Assignment Group', 'Predicted Group'])
    fname = 'Test-Data-Predictions-' + str(num) + '.xlsx'
    predictions.to_excel(fname)
    return predictions
def CalculateAccuracy(test_results):
    """Return the percentage of rows where 'Predicted Group' matches
    'Assignment Group'."""
    # Vectorised comparison replaces the original row-wise apply + filter,
    # which was slower and harder to read; result is identical.
    matches = test_results['Assignment Group'] == test_results['Predicted Group']
    return (matches.sum() / test_results.shape[0]) * 100
from statistics import mean
# Concatenate test data split which is untouched for predictions
test = pd.concat([X_test, y_test], axis=1)
# Create test data using the indexes of X_test & y_test
test_data = final_df_group0.iloc[list(test.index.values)]
test_data = test_data.reset_index(drop=True)
# Load the best models saved earlier for Predictions
grp0_model = torch.load(filePath + 'grp0-svm-model.pkl')
others_model = torch.load(filePath + 'others-lr-model.pkl')
# Per-subset accuracies, averaged at the end
predictions = []
# Generate random numbers for taking samples from test data and loop through it
nos = GetRandomIntegers(10)
m=0
for no in nos:
    m=m+1
    results = GetPredictions(no, test_data, grp0_model, others_model)
    acc = CalculateAccuracy(results)
    predictions.append(acc)
    # Accuracy for this sampled subset
    print("Prediction Accuracy of Set {0} : {1}".format(str(m), acc))
print("Mean Accuracy of the Test Data Prediction is", mean(predictions))
Automatic Ticket Assignment problem statement is a classical case of Multi-Class text classification problems. To achieve the goal of building a Classifier that can classify the tickets we have implemented the below steps